#!/usr/bin/python3
# -*- coding: UTF-8 -*-

# Python3 标准库
# https://docs.python.org/zh-cn/3/library/index.html

# 解析HTML
# 1: 正则表达式大法
# 2: requests-html pip install requests-html
# 3: BeautifulSoup pip install beautifulsoup4
# 4: lxml.XPath    pip install lxml
# [](https://imgconvert.csdnimg.cn/aHR0cHM6Ly9pbWcxLnR1aWNvb2wuY29tL25peUlSYkouanBnIXdlYg?x-oss-process=image/format,png)
# 5: SGMLParser
# 6: HTMLParser

# version: 3.11.4
# 执行
# 创建空白__init__.py


import requests
import os
import sys
import time
import datetime
import html
import random
import math
import re
import json
import urllib
import shutil
import glob
import zlib
import doctest
import hashlib
from pathlib import Path
from io import StringIO
from bs4 import BeautifulSoup


def getdocumentfromurl(url):
    """Download *url* and return its content parsed as a BeautifulSoup document.

    The page is requested with a desktop Firefox ``User-Agent`` header, its
    body is decoded as UTF-8, passed through ``html.unescape`` and then parsed
    with the ``html.parser`` backend.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built from the decoded page text.

    Raises:
        urllib.error.URLError: If the request cannot be completed
            (``urlopen`` raises its ``HTTPError`` subclass for HTTP error
            statuses).
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.request`` has been loaded.
    import urllib.request

    # The class is ``Request`` (capital R); ``urllib.request.request`` does
    # not exist and raised AttributeError on every call.
    request = urllib.request.Request(url)
    # request.add_header('Host','www.biququ.la')
    # request.add_header('Referer','https://www.biququ.la/html/27744/')
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0')

    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request) as response:
        if response.status != 200:
            # print() does not interpolate %-placeholders by itself, so the
            # message is formatted explicitly before being printed.
            print("访问%s失败，返回码为:%s" % (url, response.status))
        raw_body = response.read()

    # Decode as UTF-8 so that Chinese text is readable, then resolve HTML
    # character references before handing the text to the parser.
    html_string = html.unescape(raw_body.decode('utf-8'))
    return BeautifulSoup(html_string, 'html.parser')


def parse(document):
    """Extract the chapter title and paragraphs from a parsed page.

    The title is the text of the first ``div.bookname h1`` element under
    ``document.body``; the paragraphs are the texts of all ``div#content p``
    elements. The resulting chapter text is printed and returned.

    Args:
        document: Parsed page exposing ``body.select(css_selector)``.

    Returns:
        The title and every paragraph, each followed by a newline, with one
        extra trailing newline separating this chapter from the next.

    Raises:
        IndexError: If no ``div.bookname h1`` element is found.
    """
    body = document.body
    heading = body.select("div.bookname h1")[0].text
    paragraph_nodes = body.select("div#content p")

    lines = [heading]
    lines.extend(node.text for node in paragraph_nodes)

    # One newline after every line plus a blank line at the end of the chapter.
    chapter = "\n".join(lines) + "\n\n"
    print(chapter)
    return chapter


def write2file(chapter, file):
    """Append *chapter* to the text file *file* using UTF-8 encoding.

    Args:
        chapter: Text to append.
        file: Path of the output file; it is created if it does not exist.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # ``encoding`` must be passed by keyword: the third positional parameter
    # of open() is ``buffering`` and rejects a string. The ``with`` block
    # guarantees the file is flushed and closed.
    with open(file, "a", encoding="utf-8") as fo:
        fo.write(chapter)


def webcrawler():
    """Download chapter pages 587 through 680 and append them to one file.

    Each page is fetched with ``getdocumentfromurl``, converted to text with
    ``parse`` and appended to the output file with ``write2file``. Prints
    ``finish`` once every page has been processed.
    """
    output_path = "星辰变-biququ.txt"
    url_template = "https://www.biququ.la/html/27744/34680%d.html"

    # NOTE(review): range() excludes 681, while the comment at the end of the
    # file lists 34680681.html as the last page - confirm the intended bound.
    for page_number in range(587, 681):
        page = getdocumentfromurl(url_template % page_number)
        write2file(parse(page), output_path)

    print("finish")

# Script entry point: the crawl starts as soon as this module is executed.
# The call is not behind a ``__main__`` guard, so importing the module runs it too.
webcrawler()

# biququ
# start = "https://www.biququ.la/html/27744/346801.html"
# end = "https://www.biququ.la/html/27744/34680681.html"
# qidian
# catalogUrl = "https://book.qidian.com/info/118447/#Catalog"
